from __future__ import print_function as pr  # must be the very first import (SyntaxError otherwise)

import datetime as dt

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import plotly as ply
from scipy import stats
from sklearn import metrics as mt
from sklearn.linear_model import LogisticRegression as lr
from sklearn.model_selection import ShuffleSplit as ss
from sklearn.pipeline import Pipeline as pl
from sklearn.preprocessing import StandardScaler as sts
from sklearn.svm import SVC
# -------------------------------------------------------------------
# Load the rainfall data set and clean it: drop sparse / irrelevant
# columns, remove rows with missing values, then remove rows holding
# numeric outliers (|z-score| > 3).
# -------------------------------------------------------------------
# working object -- original full data set, kept for reference
# rainfall_original = pd.read_csv('weatherAus.csv')

# read in data set (first CSV column is the row index)
rainfall = pd.read_csv('rainfall.csv', index_col=0)
rainfall.info()
print('Shape of Data Set is', rainfall.shape)

# sort by least to greatest number of observations by feature
rainfall.count().sort_values()

# find the percentage of missing values for each feature
missing_perc = rainfall.isnull().sum() / len(rainfall) * 100
missing_perc

# Features with less than 80% data will be removed: Evaporation,
# Sunshine, Cloud9am, Cloud3pm.  Date and Location are also dropped
# since neither is used by the prediction model.
rainfall = rainfall.drop(
    ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Date', 'Location'],
    axis=1)
print('Shape of Data Set is', rainfall.shape)

# getting rid of all NaN values in the data set
rainfall = rainfall.dropna(how='any')
print('Shape of Data Set is', rainfall.shape)

# Outlier removal: any numeric value with z-score > 3 flags its row.
# select_dtypes is the public API (the original used the private
# DataFrame._get_numeric_data helper).
z = np.abs(stats.zscore(rainfall.select_dtypes(include=[np.number])))
print(z)
rainfall = rainfall[(z < 3).all(axis=1)]
print('Shape of Data Set is', rainfall.shape)
# -------------------------------------------------------------------
# Separate the label column from the feature matrix and build the
# cross-validation splitter.
# -------------------------------------------------------------------
# we want to predict the X and y data as follows:
if 'RainTomorrow' in rainfall:
    y = rainfall['RainTomorrow'].values  # get the labels we want
    del rainfall['RainTomorrow']         # get rid of the class label
x = rainfall.values  # use everything else to predict!

# split our data into training and testing splits
num_cv_iterations = 5
num_instances = len(y)
# random_state pins the shuffling so the reported scores are reproducible
cv_object = ss(n_splits=num_cv_iterations, test_size=0.2, random_state=42)
print(cv_object)
import time

# -------------------------------------------------------------------
# Logistic-regression cross-validation loop.
# For each ShuffleSplit fold: standardize the features (scaler fit on
# the training rows only, so no snooping at the test set), train an
# L2-penalized logistic regression, report accuracy and the confusion
# matrix, and record the learned coefficient vector so the mean/std
# across folds can be plotted afterwards.
# -------------------------------------------------------------------
column_names = rainfall.columns
weights_array = []  # one coefficient vector per CV iteration
scl_obj = sts()
t0 = time.time()
for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(x, y)):
    # find scalings for each column that make it zero mean / unit std
    scl_obj.fit(x[train_indices])
    X_train_scaled = scl_obj.transform(x[train_indices])  # apply to training
    # apply the *training* means and stds to the test set
    X_test_scaled = scl_obj.transform(x[test_indices])

    # train the model; the small 'C' means stronger L2 regularization
    lr_clf = lr(penalty='l2', C=0.05)
    lr_clf.fit(X_train_scaled, y[train_indices])
    y_hat = lr_clf.predict(X_test_scaled)  # get test set predictions

    acc = mt.accuracy_score(y[test_indices], y_hat)
    conf = mt.confusion_matrix(y[test_indices], y_hat)
    print("")
    print('accuracy:', acc)
    print(conf)
    print("Time to Run:", time.time() - t0)

    # print each attribute's weight, and keep the whole vector for later
    coef_series = pd.Series(lr_clf.coef_[0], index=column_names)
    for name, coef in coef_series.items():
        print(name, 'has weight of', coef)
    weights_array.append(coef_series.values)

weights_array = np.array(weights_array)
# Bar chart of the logistic-regression weights, averaged over the CV
# folds, with the per-fold standard deviation shown as error bars.
ply.offline.init_notebook_mode()  # run at the start of every notebook

avg_weights = np.mean(weights_array, axis=0)
sd_weights = np.std(weights_array, axis=0)
summary = pd.DataFrame(data={'mean': avg_weights, 'std': sd_weights},
                       index=column_names).sort_values(by=['mean'])

bar_trace = {
    'x': summary.index,
    'y': summary['mean'].values,
    'error_y': {'type': 'data',
                'array': summary['std'].values,
                'visible': True},
    'type': 'bar',
}
fig = {'data': [bar_trace],
       'layout': {'title': 'Logistic Regression Weights, with error bars'}}
ply.offline.iplot(fig)
# -------------------------------------------------------------------
# Linear-SVM cross-validation loop (mirrors the logistic-regression
# loop above).  Bug fixed: the scaler is now re-fit on each fold's
# training rows -- the original only called transform(), silently
# reusing whatever statistics the previous loop had fit last.
# -------------------------------------------------------------------
weights_array = []  # one coefficient vector per CV iteration
t0 = time.time()
for train_indices, test_indices in cv_object.split(x, y):
    # explicit names make the steps below more obvious to read
    X_train = x[train_indices]
    y_train = y[train_indices]
    X_test = x[test_indices]
    y_test = y[test_indices]

    # fit the scaler on THIS fold's training data, then apply to both
    scl_obj.fit(X_train)
    X_train_scaled = scl_obj.transform(X_train)  # apply to training
    X_test_scaled = scl_obj.transform(X_test)

    # linear kernel so the learned coef_ weights stay interpretable
    svm_clf = SVC(C=0.5, kernel='linear', degree=3, gamma='auto')
    svm_clf.fit(X_train_scaled, y_train)
    y_hat = svm_clf.predict(X_test_scaled)  # get test set predictions

    acc = mt.accuracy_score(y_test, y_hat)
    conf = mt.confusion_matrix(y_test, y_hat)
    print("")
    print('accuracy:', acc)
    print(conf)
    print("Time to Run:", time.time() - t0)

    # print each attribute's weight, and keep the whole vector for later
    coef_series = pd.Series(svm_clf.coef_[0], index=column_names)
    for name, coef in coef_series.items():
        print(name, 'has weight of', coef)
    weights_array.append(coef_series.values)

weights_array = np.array(weights_array)

# look at the support vectors of the last fold's model
print(svm_clf.support_vectors_.shape)
print(svm_clf.support_.shape)
print(svm_clf.n_support_)
# Bar chart of the linear-SVM weights, averaged over the CV folds,
# with the per-fold standard deviation shown as error bars.
ply.offline.init_notebook_mode()  # run at the start of every notebook

avg_weights = np.mean(weights_array, axis=0)
sd_weights = np.std(weights_array, axis=0)
summary = pd.DataFrame(data={'mean': avg_weights, 'std': sd_weights},
                       index=column_names).sort_values(by=['mean'])

bar_trace = {
    'x': summary.index,
    'y': summary['mean'].values,
    'error_y': {'type': 'data',
                'array': summary['std'].values,
                'visible': True},
    'type': 'bar',
}
fig = {'data': [bar_trace],
       'layout': {'title': 'Support Vector Machines Weights, with error bars'}}
ply.offline.iplot(fig)
# -------------------------------------------------------------------
# Look at the instances the SVM chose as support vectors (usually the
# hard-to-classify rows) and compare them to the full data set.
# Uses train_indices / svm_clf left over from the last CV fold above.
# -------------------------------------------------------------------
# make a dataframe of the training data (indices saved from the loop)
df_tested_on = rainfall.iloc[train_indices]
# grab the rows that were selected as support vectors; .copy() so the
# label column added below goes onto a real frame, not a slice view
# (avoids pandas' SettingWithCopyWarning)
df_support = df_tested_on.iloc[svm_clf.support_, :].copy()
# Bug fixed: support_ holds positions *within the training subset*, so
# the matching labels are y[train_indices][support_], not y[support_].
df_support['RainTomorrow'] = y[train_indices][svm_clf.support_]
rainfall['RainTomorrow'] = y  # also add the label back to the original data
df_support.info()

# now lets see the statistics of these attributes
# (pandas.tools was removed from pandas; pandas.plotting is the public home)
from pandas.plotting import boxplot

# group the original data and the support vectors by the class label
df_grouped_support = df_support.groupby(['RainTomorrow'])
df_grouped = rainfall.groupby(['RainTomorrow'])
# Side-by-side KDE plots for every attribute: the distribution among
# the support vectors (left) vs. the full cleaned data set (right),
# split by class ('no rain' / 'rained').
vars_to_plot = column_names
for v in vars_to_plot:
    plt.figure(figsize=(10, 4))

    # plot support vector stats
    plt.subplot(1, 2, 1)
    df_grouped_support[v].plot.kde()
    plt.legend(['no rain', 'rained'])
    plt.title(v + ' (Instances chosen as Support Vectors)')

    # plot original distributions
    plt.subplot(1, 2, 2)
    df_grouped[v].plot.kde()
    plt.legend(['no rain', 'rained'])
    plt.title(v + ' (Original)')